QDA

Author

Lizzie Healy

Published

April 17, 2025

Quadratic Discriminant Analysis

ALL CURREL CATEGORIES

Reading in the Data

Code
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, RocCurveDisplay
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.utils import compute_class_weight
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
import warnings

# dealing with an SkLearn deprecated warning
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

# reading in data
religion = pd.read_csv("../data/religion_data_no99.csv")

# Christian, non-christian, unaffiliated
grouping_map = {
    1000: 'Protestant',
    10000: 'Catholic',
    20000: 'Mormon',
    30000: 'Orthodox Christian',
    40001: 'Jehovahs Witness',
    40002: 'Other Christian',
    50000: 'Jewish',
    60000: 'Muslim',
    70000: 'Buddhist',
    80000: 'Hindu',
    90001: 'Other world Religions',
    90002: 'Other faiths',
    100000: 'Unaffiliated'
}

religion['CURREL_NEW'] = religion['CURREL'].map(grouping_map)
Code
# getting the x and y variables
X_int = religion.drop(columns=['RELTRAD', 'CURREL_NEW'])

# take currel
y = religion['CURREL_NEW']

# drop some rows for y
print(y.value_counts())

# checking shapes
print(y.shape)
print(X_int.shape)
CURREL_NEW
Protestant            8723
Unaffiliated          7355
Catholic              4074
Jewish                 521
Mormon                 362
Buddhist               232
Muslim                 167
Hindu                  164
Orthodox Christian     132
Jehovahs Witness        34
Name: count, dtype: int64
(21764,)
(21764, 100)

Preprocessing

Code
# scaling X value
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_int)

# split data into test, train, validation
X_tmp, X_test, y_tmp, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=6600)
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=6600)

# print
print("\nTRAIN")
print(X_train.shape)
print(y_train.shape)

print("\nVALIDATION")
print(X_val.shape)
print(y_val.shape)

print("\nTEST")
print(X_test.shape)
print(y_test.shape)

TRAIN
(13928, 100)
(13928,)

VALIDATION
(3483, 100)
(3483,)

TEST
(4353, 100)
(4353,)

Dealing with the Class Imbalances

Code
# checking the imbalance
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# adding re-sampling to deal with class imabalance
smote = SMOTE(random_state=6600)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

Hyperparameter Tuning (reg_param)

Code
# initiating parameters
best_val = 0
opt_reg = None
val_scores = {}
reg_params = [0.0, 0.05, 0.1, 0.2, 0.5, 0.9]

for r in reg_params:
    qda_model = QuadraticDiscriminantAnalysis(reg_param=0.2)
    qda_model.fit(X_train_bal, y_train_bal)

    # getting the predictions
    y_val_pred = qda_model.predict(X_val)
    val_score = accuracy_score(y_val, y_val_pred)
    val_scores[r] = val_score

    # updating best reg value
    if val_score > best_val:
        best_val = val_score
        opt_reg = r

print("\nOptimal reg_param:", opt_reg)
print("Validation accuracy:", best_val)

# plotting
plt.plot(val_scores.keys(), val_scores.values(), marker='o')
plt.title("Validation Accuracy vs reg_param")
plt.xlabel("reg_param")
plt.ylabel("Validation Accuracy")
plt.grid(True)
plt.show()

Optimal reg_param: 0.0
Validation accuracy: 0.7203560149296584

Fitting the Model

Code
opt_reg = 0.2
qda_model = QuadraticDiscriminantAnalysis(reg_param=opt_reg) # iniating QDA
qda_model.fit(X_train_bal, y_train_bal)

# getting the predictions
y_train_pred = qda_model.predict(X_train)
y_test_pred = qda_model.predict(X_test)
y_val_pred = qda_model.predict(X_val)

Visualizing Results

Code
# classification report
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred, zero_division=0))
print(accuracy_score(y_test, y_test_pred))
print("-----------------------")

# confusion matrix
print("CONFUSION MATRIX:")
print(confusion_matrix(y_test, y_test_pred))
print("-----------------------")

# ROC curve
print("ROC CURVES:")
classes = qda_model.classes_ # getting classes
y_score = qda_model.predict_proba(X_test) # predictions
y_onehot = label_binarize(y_test, classes=classes)
for i, label in enumerate(classes): # plotting ROC for all classes of digits
    auc = roc_auc_score(y_onehot[:, i], y_score[:, i])
    display = RocCurveDisplay.from_predictions( # ROC
        y_true=y_onehot[:, i],
        y_pred=y_score[:, i],
        name=f"Religion {label} vs the rest",
        color="darkorange",
        plot_chance_level=True,
        despine=True,
        )
    _ = display.ax_.set(
        xlabel="False Positive Rate",
        ylabel="True Positive Rate"
    )
plt.show()
CLASSIFICATION REPORT:
                    precision    recall  f1-score   support

          Buddhist       0.21      0.17      0.19        48
          Catholic       0.58      0.67      0.62       824
             Hindu       0.64      0.38      0.47        37
  Jehovahs Witness       0.00      0.00      0.00         9
            Jewish       0.23      0.62      0.33        98
            Mormon       0.14      0.67      0.24        58
            Muslim       0.47      0.42      0.44        19
Orthodox Christian       0.05      0.05      0.05        20
        Protestant       0.82      0.63      0.71      1752
      Unaffiliated       0.94      0.90      0.92      1488

          accuracy                           0.72      4353
         macro avg       0.41      0.45      0.40      4353
      weighted avg       0.78      0.72      0.74      4353

0.7176659774867907
-----------------------
CONFUSION MATRIX:
[[   8    5    1    0    6    1    2    0    2   23]
 [   1  548    3    0   45   41    2    4  163   17]
 [   2    3   14    0    2    0    3    0    0   13]
 [   0    2    0    0    0    0    0    0    7    0]
 [   2   12    0    0   61    0    0    0   11   12]
 [   0    2    0    0    1   39    0    0   16    0]
 [   0    2    0    0    1    0    8    1    4    3]
 [   1    7    0    0    1    0    0    1    9    1]
 [   4  352    2    0   75  190    1   15 1102   11]
 [  20   20    2    0   77    1    1    1   23 1343]]
-----------------------
ROC CURVES:

CHRISTIAN VS NON-CHRISTIAN VS UNAFFILIATED

Reading in the Data

Code
# reading in data
religion = pd.read_csv("../data/religion_full_currel.csv")

Dealing with Class Imbalance

Code
# visualizing imbalance
y = religion['CURREL']
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# dropping refused 
religion = religion[religion['CURREL'] != 900000]

# Christian, non-christian, unaffiliated
grouping_map = {
    1000: 'Christian',
    10000: 'Christian',
    20000: 'Christian',
    30000: 'Christian',
    40001: 'Christian',
    40002: 'Christian',
    50000: 'Non-Christian',
    60000: 'Non-Christian',
    70000: 'Non-Christian',
    80000: 'Non-Christian',
    90001: 'Non-Christian',
    90002: 'Non-Christian',
    100000: 'Unaffiliated'
}

religion['CURREL_NEW'] = religion['CURREL'].map(grouping_map)

Code
# getting the x and y variables
X_int = religion.drop(columns=['RELTRAD', 'CURREL_NEW'])

# take currel
y = religion['CURREL_NEW']

# drop some rows for y
print(y.value_counts())

# checking shapes
print(y.shape)
print(X_int.shape)
CURREL_NEW
Christian        13442
Unaffiliated      7355
Non-Christian     1651
Name: count, dtype: int64
(22448,)
(22448, 100)

Preprocessing

Code
# scaling X value
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_int)

# split data into test, train, validation
X_tmp, X_test, y_tmp, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=6600)
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=6600)

# print
print("\nTRAIN")
print(X_train.shape)
print(y_train.shape)

print("\nVALIDATION")
print(X_val.shape)
print(y_val.shape)

print("\nTEST")
print(X_test.shape)
print(y_test.shape)

TRAIN
(14366, 100)
(14366,)

VALIDATION
(3592, 100)
(3592,)

TEST
(4490, 100)
(4490,)

Dealing with the Class Imbalances

Code
# checking the imbalance
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# adding re-sampling to deal with class imabalance
smote = SMOTE(random_state=6600)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

Hyperparameter Tuning (reg_param)

Code
# initiating parameters
best_val = 0
opt_reg = None
val_scores = {}
reg_params = [0.0, 0.05, 0.1, 0.2, 0.5, 0.9]

for r in reg_params:
    qda_model = QuadraticDiscriminantAnalysis(reg_param=0.2)
    qda_model.fit(X_train_bal, y_train_bal)

    # getting the predictions
    y_val_pred = qda_model.predict(X_val)
    val_score = accuracy_score(y_val, y_val_pred)
    val_scores[r] = val_score

    # updating best reg value
    if val_score > best_val:
        best_val = val_score
        opt_reg = r

print("\nOptimal reg_param:", opt_reg)
print("Validation accuracy:", best_val)

# plotting
plt.plot(val_scores.keys(), val_scores.values(), marker='o')
plt.title("Validation Accuracy vs reg_param")
plt.xlabel("reg_param")
plt.ylabel("Validation Accuracy")
plt.grid(True)
plt.show()

Optimal reg_param: 0.0
Validation accuracy: 0.8933741648106904

Fitting the Model

Code
opt_reg = 0.2
qda_model = QuadraticDiscriminantAnalysis(reg_param=opt_reg) # iniating QDA
qda_model.fit(X_train_bal, y_train_bal)

# getting the predictions
y_train_pred = qda_model.predict(X_train)
y_test_pred = qda_model.predict(X_test)
y_val_pred = qda_model.predict(X_val)

Visualizing Results

Code
# classification report
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred, zero_division=0))
print(accuracy_score(y_test, y_test_pred))
print("-----------------------")

# confusion matrix
print("CONFUSION MATRIX:")
print(confusion_matrix(y_test, y_test_pred))
print("-----------------------")

# ROC curve
print("ROC CURVES:")
classes = qda_model.classes_ # getting classes
y_score = qda_model.predict_proba(X_test) # predictions
y_onehot = label_binarize(y_test, classes=classes)
for i, label in enumerate(classes): # plotting ROC for all classes of digits
    auc = roc_auc_score(y_onehot[:, i], y_score[:, i])
    display = RocCurveDisplay.from_predictions( # ROC
        y_true=y_onehot[:, i],
        y_pred=y_score[:, i],
        name=f"Religion {label} vs the rest",
        color="darkorange",
        plot_chance_level=True,
        despine=True,
        )
    _ = display.ax_.set(
        xlabel="False Positive Rate",
        ylabel="True Positive Rate"
    )
plt.show()
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

    Christian       0.97      0.92      0.95      2724
Non-Christian       0.38      0.57      0.46       317
 Unaffiliated       0.91      0.89      0.90      1449

     accuracy                           0.89      4490
    macro avg       0.75      0.80      0.77      4490
 weighted avg       0.91      0.89      0.90      4490

0.889532293986637
-----------------------
CONFUSION MATRIX:
[[2519  175   30]
 [  37  181   99]
 [  36  119 1294]]
-----------------------
ROC CURVES:

PROTESTANT, CATHOLIC, MORMON, CHRISTIAN, JEWISH, MUSLIM, OTHER, UNAFFALIATED

Reading in the Data

Code
# reading in data
religion = pd.read_csv("../data/religion_full_currel.csv")

Dealing with Class Imbalance

Code
# visualizing imbalance
y = religion['CURREL']
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# dropping refused 
religion = religion[religion['CURREL'] != 900000]

# Christian, non-christian, unaffiliated
grouping_map = {
    1000: 'Protestant',
    10000: 'Catholic',
    20000: 'Mormon',
    30000: 'Christian',
    40001: 'Christian',
    40002: 'Christian',
    50000: 'Jewish',
    60000: 'Muslim',
    70000: 'Other Religion',
    80000: 'Other Religion',
    90001: 'Other Religion',
    90002: 'Other Religion',
    100000: 'Unaffiliated'
}

religion['CURREL_NEW'] = religion['CURREL'].map(grouping_map)

Code
# getting the x and y variables
X_int = religion.drop(columns=['RELTRAD', 'CURREL_NEW'])

# take currel
y = religion['CURREL_NEW']

# drop some rows for y
print(y.value_counts())

# checking shapes
print(y.shape)
print(X_int.shape)
CURREL_NEW
Protestant        8723
Unaffiliated      7355
Catholic          4074
Other Religion     963
Jewish             521
Mormon             362
Christian          283
Muslim             167
Name: count, dtype: int64
(22448,)
(22448, 100)

Preprocessing

Code
# scaling X value
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_int)

# split data into test, train, validation
X_tmp, X_test, y_tmp, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=6600)
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=6600)

# print
print("\nTRAIN")
print(X_train.shape)
print(y_train.shape)

print("\nVALIDATION")
print(X_val.shape)
print(y_val.shape)

print("\nTEST")
print(X_test.shape)
print(y_test.shape)

TRAIN
(14366, 100)
(14366,)

VALIDATION
(3592, 100)
(3592,)

TEST
(4490, 100)
(4490,)

Dealing with the Class Imbalances

Code
# checking the imbalance
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# adding re-sampling to deal with class imabalance
smote = SMOTE(random_state=6600)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

Hyperparameter Tuning (reg_param)

Code
# initiating parameters
best_val = 0
opt_reg = None
val_scores = {}
reg_params = [0.0, 0.05, 0.1, 0.2, 0.5, 0.9]

for r in reg_params:
    qda_model = QuadraticDiscriminantAnalysis(reg_param=0.2)
    qda_model.fit(X_train_bal, y_train_bal)

    # getting the predictions
    y_val_pred = qda_model.predict(X_val)
    val_score = accuracy_score(y_val, y_val_pred)
    val_scores[r] = val_score

    # updating best reg value
    if val_score > best_val:
        best_val = val_score
        opt_reg = r

print("\nOptimal reg_param:", opt_reg)
print("Validation accuracy:", best_val)

# plotting
plt.plot(val_scores.keys(), val_scores.values(), marker='o')
plt.title("Validation Accuracy vs reg_param")
plt.xlabel("reg_param")
plt.ylabel("Validation Accuracy")
plt.grid(True)
plt.show()

Optimal reg_param: 0.0
Validation accuracy: 0.6909799554565702

Fitting the Model

Code
opt_reg = 0.2
qda_model = QuadraticDiscriminantAnalysis(reg_param=opt_reg) # iniating QDA
qda_model.fit(X_train_bal, y_train_bal)

# getting the predictions
y_train_pred = qda_model.predict(X_train)
y_test_pred = qda_model.predict(X_test)
y_val_pred = qda_model.predict(X_val)

Visualizing Results

Code
# classification report
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred, zero_division=0))
print(accuracy_score(y_test, y_test_pred))
print("-----------------------")

# confusion matrix
print("CONFUSION MATRIX:")
print(confusion_matrix(y_test, y_test_pred))
print("-----------------------")

# ROC curve
print("ROC CURVES:")
classes = qda_model.classes_ # getting classes
y_score = qda_model.predict_proba(X_test) # predictions
y_onehot = label_binarize(y_test, classes=classes)
for i, label in enumerate(classes): # plotting ROC for all classes of digits
    auc = roc_auc_score(y_onehot[:, i], y_score[:, i])
    display = RocCurveDisplay.from_predictions( # ROC
        y_true=y_onehot[:, i],
        y_pred=y_score[:, i],
        name=f"Religion {label} vs the rest",
        color="darkorange",
        plot_chance_level=True,
        despine=True,
        )
    _ = display.ax_.set(
        xlabel="False Positive Rate",
        ylabel="True Positive Rate"
    )
plt.show()
CLASSIFICATION REPORT:
                precision    recall  f1-score   support

      Catholic       0.56      0.64      0.60       815
     Christian       0.14      0.16      0.15        63
        Jewish       0.23      0.50      0.32       107
        Mormon       0.18      0.65      0.29        69
        Muslim       0.53      0.22      0.31        37
Other Religion       0.29      0.48      0.36       173
    Protestant       0.83      0.65      0.73      1777
  Unaffiliated       0.91      0.82      0.86      1449

      accuracy                           0.68      4490
     macro avg       0.46      0.52      0.45      4490
  weighted avg       0.75      0.68      0.71      4490

0.6841870824053452
-----------------------
CONFUSION MATRIX:
[[ 525   18   43   39    0   13  166   11]
 [  14   10    4    1    1    8   22    3]
 [  18    1   54    1    0    5    7   21]
 [  12    0    0   45    0    0   11    1]
 [   6    2    1    0    8   12    6    2]
 [   5    1    8    0    4   83    7   65]
 [ 341   31   71  159    1    6 1160    8]
 [  19    8   51    0    1  159   24 1187]]
-----------------------
ROC CURVES:

USING THE SELECTED FEATURES FOR ALL CURREL

Reading in the Data

Code
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, RocCurveDisplay
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.utils import compute_class_weight
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
import warnings

# dealing with an SkLearn deprecated warning
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

# reading in data
religion = pd.read_csv("../data/religion_data_no99.csv")

# Christian, non-christian, unaffiliated
grouping_map = {
    1000: 'Protestant',
    10000: 'Catholic',
    20000: 'Mormon',
    30000: 'Orthodox Christian',
    40001: 'Jehovahs Witness',
    40002: 'Other Christian',
    50000: 'Jewish',
    60000: 'Muslim',
    70000: 'Buddhist',
    80000: 'Hindu',
    90001: 'Other world Religions',
    90002: 'Other faiths',
    100000: 'Unaffiliated'
}

religion['CURREL_NEW'] = religion['CURREL'].map(grouping_map)
Code
# getting the x and y variables
X_int = religion.drop(columns=['RELTRAD', 'CURREL_NEW'])

# take currel
y = religion['CURREL_NEW']

# drop some rows for y
print(y.value_counts())

# checking shapes
print(y.shape)
print(X_int.shape)
CURREL_NEW
Protestant            8723
Unaffiliated          7355
Catholic              4074
Jewish                 521
Mormon                 362
Buddhist               232
Muslim                 167
Hindu                  164
Orthodox Christian     132
Jehovahs Witness        34
Name: count, dtype: int64
(21764,)
(21764, 100)

Taking just the selected

Code
selected_features = [0, 7, 8, 10, 11, 13, 14, 16, 17, 19, 21, 22, 24, 25, 26, 27, 29, 30, 32, 33, 35, 36, 37, 38, 44, 47, 50, 53, 54, 57, 60, 62, 63, 64, 65, 70, 71, 74, 75, 77, 80, 81, 82, 83, 85, 86, 88, 90, 91, 92]

X_int = X_int.iloc[:, selected_features]
print(X_int.shape)
(21764, 50)

Preprocessing

Code
# scaling X value
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_int)

# split data into test, train, validation
X_tmp, X_test, y_tmp, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=6600)
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=6600)

# print
print("\nTRAIN")
print(X_train.shape)
print(y_train.shape)

print("\nVALIDATION")
print(X_val.shape)
print(y_val.shape)

print("\nTEST")
print(X_test.shape)
print(y_test.shape)

TRAIN
(13928, 50)
(13928,)

VALIDATION
(3483, 50)
(3483,)

TEST
(4353, 50)
(4353,)

Dealing with the Class Imbalances

Code
# checking the imbalance
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# adding re-sampling to deal with class imabalance
smote = SMOTE(random_state=6600)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

Hyperparameter Tuning (reg_param)

Code
# initiating parameters
best_val = 0
opt_reg = None
val_scores = {}
reg_params = [0.0, 0.05, 0.1, 0.2, 0.5, 0.9]

for r in reg_params:
    qda_model = QuadraticDiscriminantAnalysis(reg_param=0.2)
    qda_model.fit(X_train_bal, y_train_bal)

    # getting the predictions
    y_val_pred = qda_model.predict(X_val)
    val_score = accuracy_score(y_val, y_val_pred)
    val_scores[r] = val_score

    # updating best reg value
    if val_score > best_val:
        best_val = val_score
        opt_reg = r

print("\nOptimal reg_param:", opt_reg)
print("Validation accuracy:", best_val)

# plotting
plt.plot(val_scores.keys(), val_scores.values(), marker='o')
plt.title("Validation Accuracy vs reg_param")
plt.xlabel("reg_param")
plt.ylabel("Validation Accuracy")
plt.grid(True)
plt.show()

Optimal reg_param: 0.0
Validation accuracy: 0.6715475165087568

Fitting the Model

Code
opt_reg = 0.2
qda_model = QuadraticDiscriminantAnalysis(reg_param=opt_reg) # iniating QDA
qda_model.fit(X_train_bal, y_train_bal)

# getting the predictions
y_train_pred = qda_model.predict(X_train)
y_test_pred = qda_model.predict(X_test)
y_val_pred = qda_model.predict(X_val)

Visualizing Results

Code
# classification report
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred, zero_division=0))
print(accuracy_score(y_test, y_test_pred))
print("-----------------------")

# confusion matrix
print("CONFUSION MATRIX:")
print(confusion_matrix(y_test, y_test_pred))
print("-----------------------")

# ROC curve
print("ROC CURVES:")
classes = qda_model.classes_ # getting classes
y_score = qda_model.predict_proba(X_test) # predictions
y_onehot = label_binarize(y_test, classes=classes)
for i, label in enumerate(classes): # plotting ROC for all classes of digits
    auc = roc_auc_score(y_onehot[:, i], y_score[:, i])
    display = RocCurveDisplay.from_predictions( # ROC
        y_true=y_onehot[:, i],
        y_pred=y_score[:, i],
        name=f"Religion {label} vs the rest",
        color="darkorange",
        plot_chance_level=True,
        despine=True,
        )
    _ = display.ax_.set(
        xlabel="False Positive Rate",
        ylabel="True Positive Rate"
    )
plt.show()
CLASSIFICATION REPORT:
                    precision    recall  f1-score   support

          Buddhist       0.16      0.38      0.23        48
          Catholic       0.61      0.59      0.60       824
             Hindu       0.30      0.57      0.39        37
  Jehovahs Witness       0.67      0.22      0.33         9
            Jewish       0.26      0.61      0.36        98
            Mormon       0.09      0.69      0.15        58
            Muslim       0.23      0.47      0.31        19
Orthodox Christian       0.03      0.25      0.05        20
        Protestant       0.84      0.54      0.66      1752
      Unaffiliated       0.98      0.87      0.92      1488

          accuracy                           0.66      4353
         macro avg       0.42      0.52      0.40      4353
      weighted avg       0.80      0.66      0.71      4353

0.6639099471628762
-----------------------
CONFUSION MATRIX:
[[  18    2    6    0    6    0    4    0    2   10]
 [   9  486    2    0   54   76    4   44  143    6]
 [   2    2   21    0    5    0    3    0    0    4]
 [   0    2    0    2    0    1    0    0    4    0]
 [  10    7    3    0   60    3    1    4    4    6]
 [   0    1    0    0    0   40    0    3   14    0]
 [   0    1    1    0    1    3    9    3    0    1]
 [   1    2    0    0    3    3    0    5    6    0]
 [   5  278    1    0   69  337    2  104  952    4]
 [  67   18   37    1   36    5   16    5    6 1297]]
-----------------------
ROC CURVES:

USING SELECTED FOR CHRISTIAN VS NON-CHRISTIAN VS UNAFFILIATED

Reading in the Data

Code
# reading in data
religion = pd.read_csv("../data/religion_full_currel.csv")

Dealing with Class Imbalance

Code
# visualizing imbalance
y = religion['CURREL']
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# dropping refused 
religion = religion[religion['CURREL'] != 900000]

# Christian, non-christian, unaffiliated
grouping_map = {
    1000: 'Christian',
    10000: 'Christian',
    20000: 'Christian',
    30000: 'Christian',
    40001: 'Christian',
    40002: 'Christian',
    50000: 'Non-Christian',
    60000: 'Non-Christian',
    70000: 'Non-Christian',
    80000: 'Non-Christian',
    90001: 'Non-Christian',
    90002: 'Non-Christian',
    100000: 'Unaffiliated'
}

religion['CURREL_NEW'] = religion['CURREL'].map(grouping_map)

Code
# getting the x and y variables
X_int = religion.drop(columns=['RELTRAD', 'CURREL_NEW'])

# take currel
y = religion['CURREL_NEW']

# drop some rows for y
print(y.value_counts())

# checking shapes
print(y.shape)
print(X_int.shape)
CURREL_NEW
Christian        13442
Unaffiliated      7355
Non-Christian     1651
Name: count, dtype: int64
(22448,)
(22448, 100)
Code
selected_features = [0, 7, 8, 10, 11, 13, 14, 16, 17, 19, 21, 22, 24, 25, 26, 27, 29, 30, 32, 33, 35, 36, 37, 38, 44, 47, 50, 53, 54, 57, 60, 62, 63, 64, 65, 70, 71, 74, 75, 77, 80, 81, 82, 83, 85, 86, 88, 90, 91, 92]

X_int = X_int.iloc[:, selected_features]
print(X_int.shape)
(22448, 50)

Preprocessing

Code
# scaling X value
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_int)

# split data into test, train, validation
X_tmp, X_test, y_tmp, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=6600)
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=6600)

# print
print("\nTRAIN")
print(X_train.shape)
print(y_train.shape)

print("\nVALIDATION")
print(X_val.shape)
print(y_val.shape)

print("\nTEST")
print(X_test.shape)
print(y_test.shape)

TRAIN
(14366, 50)
(14366,)

VALIDATION
(3592, 50)
(3592,)

TEST
(4490, 50)
(4490,)

Dealing with the Class Imbalances

Code
# checking the imbalance
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# adding re-sampling to deal with class imabalance
smote = SMOTE(random_state=6600)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

Hyperparameter Tuning (reg_param)

Code
# initiating parameters
best_val = 0
opt_reg = None
val_scores = {}
reg_params = [0.0, 0.05, 0.1, 0.2, 0.5, 0.9]

for r in reg_params:
    qda_model = QuadraticDiscriminantAnalysis(reg_param=0.2)
    qda_model.fit(X_train_bal, y_train_bal)

    # getting the predictions
    y_val_pred = qda_model.predict(X_val)
    val_score = accuracy_score(y_val, y_val_pred)
    val_scores[r] = val_score

    # updating best reg value
    if val_score > best_val:
        best_val = val_score
        opt_reg = r

print("\nOptimal reg_param:", opt_reg)
print("Validation accuracy:", best_val)

# plotting
plt.plot(val_scores.keys(), val_scores.values(), marker='o')
plt.title("Validation Accuracy vs reg_param")
plt.xlabel("reg_param")
plt.ylabel("Validation Accuracy")
plt.grid(True)
plt.show()

Optimal reg_param: 0.0
Validation accuracy: 0.9106347438752784

Fitting the Model

Code
opt_reg = 0.2
qda_model = QuadraticDiscriminantAnalysis(reg_param=opt_reg) # iniating QDA
qda_model.fit(X_train_bal, y_train_bal)

# getting the predictions
y_train_pred = qda_model.predict(X_train)
y_test_pred = qda_model.predict(X_test)
y_val_pred = qda_model.predict(X_val)

Visualizing Results

Code
# classification report
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred, zero_division=0))
print(accuracy_score(y_test, y_test_pred))
print("-----------------------")

# confusion matrix
print("CONFUSION MATRIX:")
print(confusion_matrix(y_test, y_test_pred))
print("-----------------------")

# ROC curve
print("ROC CURVES:")
classes = qda_model.classes_ # getting classes
y_score = qda_model.predict_proba(X_test) # predictions
y_onehot = label_binarize(y_test, classes=classes)
for i, label in enumerate(classes): # plotting ROC for all classes of digits
    auc = roc_auc_score(y_onehot[:, i], y_score[:, i])
    display = RocCurveDisplay.from_predictions( # ROC
        y_true=y_onehot[:, i],
        y_pred=y_score[:, i],
        name=f"Religion {label} vs the rest",
        color="darkorange",
        plot_chance_level=True,
        despine=True,
        )
    _ = display.ax_.set(
        xlabel="False Positive Rate",
        ylabel="True Positive Rate"
    )
plt.show()
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

    Christian       0.98      0.94      0.96      2724
Non-Christian       0.42      0.60      0.49       317
 Unaffiliated       0.93      0.91      0.92      1449

     accuracy                           0.91      4490
    macro avg       0.77      0.82      0.79      4490
 weighted avg       0.92      0.91      0.91      4490

0.9060133630289532
-----------------------
CONFUSION MATRIX:
[[2565  149   10]
 [  35  191   91]
 [  18  119 1312]]
-----------------------
ROC CURVES:

USING SELECTED FOR PROTESTANT, CATHOLIC, MORMON, CHRISTIAN, JEWISH, MUSLIM, OTHER, UNAFFALIATED

Reading in the Data

Code
# reading in data
religion = pd.read_csv("../data/religion_full_currel.csv")

Dealing with Class Imbalance

Code
# visualizing imbalance
y = religion['CURREL']
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# dropping refused 
religion = religion[religion['CURREL'] != 900000]

# Christian, non-christian, unaffiliated
grouping_map = {
    1000: 'Protestant',
    10000: 'Catholic',
    20000: 'Mormon',
    30000: 'Christian',
    40001: 'Christian',
    40002: 'Christian',
    50000: 'Jewish',
    60000: 'Muslim',
    70000: 'Other Religion',
    80000: 'Other Religion',
    90001: 'Other Religion',
    90002: 'Other Religion',
    100000: 'Unaffiliated'
}

religion['CURREL_NEW'] = religion['CURREL'].map(grouping_map)

Code
# getting the x and y variables
X_int = religion.drop(columns=['RELTRAD', 'CURREL_NEW'])

# take currel
y = religion['CURREL_NEW']

# drop some rows for y
print(y.value_counts())

# checking shapes
print(y.shape)
print(X_int.shape)
CURREL_NEW
Protestant        8723
Unaffiliated      7355
Catholic          4074
Other Religion     963
Jewish             521
Mormon             362
Christian          283
Muslim             167
Name: count, dtype: int64
(22448,)
(22448, 100)
Code
selected_features = [0, 7, 8, 10, 11, 13, 14, 16, 17, 19, 21, 22, 24, 25, 26, 27, 29, 30, 32, 33, 35, 36, 37, 38, 44, 47, 50, 53, 54, 57, 60, 62, 63, 64, 65, 70, 71, 74, 75, 77, 80, 81, 82, 83, 85, 86, 88, 90, 91, 92]

X_int = X_int.iloc[:, selected_features]
print(X_int.shape)
(22448, 50)

Preprocessing

Code
# scaling X value
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_int)

# split data into test, train, validation
X_tmp, X_test, y_tmp, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=6600)
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=6600)

# print
print("\nTRAIN")
print(X_train.shape)
print(y_train.shape)

print("\nVALIDATION")
print(X_val.shape)
print(y_val.shape)

print("\nTEST")
print(X_test.shape)
print(y_test.shape)

TRAIN
(14366, 50)
(14366,)

VALIDATION
(3592, 50)
(3592,)

TEST
(4490, 50)
(4490,)

Dealing with the Class Imbalances

Code
# checking the imbalance
sns.countplot(x=y)
plt.xticks(rotation=45)
plt.title("Class Distribution in CURREL")
plt.show()

# adding re-sampling to deal with class imabalance
smote = SMOTE(random_state=6600)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

Hyperparameter Tuning (reg_param)

Code
# initiating parameters
best_val = 0
opt_reg = None
val_scores = {}
reg_params = [0.0, 0.05, 0.1, 0.2, 0.5, 0.9]

for r in reg_params:
    qda_model = QuadraticDiscriminantAnalysis(reg_param=0.2)
    qda_model.fit(X_train_bal, y_train_bal)

    # getting the predictions
    y_val_pred = qda_model.predict(X_val)
    val_score = accuracy_score(y_val, y_val_pred)
    val_scores[r] = val_score

    # updating best reg value
    if val_score > best_val:
        best_val = val_score
        opt_reg = r

print("\nOptimal reg_param:", opt_reg)
print("Validation accuracy:", best_val)

# plotting
plt.plot(val_scores.keys(), val_scores.values(), marker='o')
plt.title("Validation Accuracy vs reg_param")
plt.xlabel("reg_param")
plt.ylabel("Validation Accuracy")
plt.grid(True)
plt.show()

Optimal reg_param: 0.0
Validation accuracy: 0.6620267260579065

Fitting the Model

Code
opt_reg = 0.2
qda_model = QuadraticDiscriminantAnalysis(reg_param=opt_reg) # iniating QDA
qda_model.fit(X_train_bal, y_train_bal)

# getting the predictions
y_train_pred = qda_model.predict(X_train)
y_test_pred = qda_model.predict(X_test)
y_val_pred = qda_model.predict(X_val)

Visualizing Results

Code
# classification report
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred, zero_division=0))
print(accuracy_score(y_test, y_test_pred))
print("-----------------------")

# confusion matrix
print("CONFUSION MATRIX:")
print(confusion_matrix(y_test, y_test_pred))
print("-----------------------")

# ROC curve
print("ROC CURVES:")
classes = qda_model.classes_ # getting classes
y_score = qda_model.predict_proba(X_test) # predictions
y_onehot = label_binarize(y_test, classes=classes)
for i, label in enumerate(classes): # plotting ROC for all classes of digits
    auc = roc_auc_score(y_onehot[:, i], y_score[:, i])
    display = RocCurveDisplay.from_predictions( # ROC
        y_true=y_onehot[:, i],
        y_pred=y_score[:, i],
        name=f"Religion {label} vs the rest",
        color="darkorange",
        plot_chance_level=True,
        despine=True,
        )
    _ = display.ax_.set(
        xlabel="False Positive Rate",
        ylabel="True Positive Rate"
    )
plt.show()
CLASSIFICATION REPORT:
                precision    recall  f1-score   support

      Catholic       0.60      0.62      0.61       815
     Christian       0.12      0.30      0.18        63
        Jewish       0.26      0.60      0.36       107
        Mormon       0.10      0.74      0.18        69
        Muslim       0.40      0.49      0.44        37
Other Religion       0.27      0.45      0.34       173
    Protestant       0.85      0.55      0.66      1777
  Unaffiliated       0.94      0.83      0.88      1449

      accuracy                           0.65      4490
     macro avg       0.44      0.57      0.46      4490
  weighted avg       0.77      0.65      0.69      4490

0.6478841870824054
-----------------------
CONFUSION MATRIX:
[[ 502   36   55   71    2    4  142    3]
 [   6   19    7    7    0    8   15    1]
 [  13    6   64    2    2    7    4    9]
 [   6    2    1   51    0    2    6    1]
 [   2    5    3    2   18    4    2    1]
 [   3    4   14    1   14   77    1   59]
 [ 289   71   82  359    3    0  971    2]
 [  14    9   23    4    6  181    5 1207]]
-----------------------
ROC CURVES: